5 Metagenomic data
Species-specific community analyses conducted to generate the data included in these analyses can be found in the annex.
5.1 Library complexity
Nonpareil estimate of the metagenomic complexity after removing host DNA.
all_data %>%
select(dataset,Extraction,nonpareil,Taxon) %>%
unique() %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(nonpareil), sd(nonpareil))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of breadth of host genome coverage")| Taxon | ZYMO | DREX | EHEX |
|---|---|---|---|
| Amphibian | 0.9±0.1 | 0.8±0.1 | 0.8±0.1 |
| Reptile | 0.9±0.1 | 0.9±0.0 | 0.9±0.1 |
| Mammal | 0.8±0.2 | 0.8±0.1 | 0.7±0.3 |
| Bird | 0.9±0.1 | 1.0±0.0 | 0.8±0.4 |
| Control | 0.0±0.0 | 0.5±0.6 | 0.5±0.7 |
all_data %>%
select(dataset,Extraction,nonpareil,Taxon,Species) %>%
unique() %>%
ggplot(aes(x=Extraction,y=nonpareil, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(. ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Nonpareil completeness",x="Extraction method")all_data %>%
select(dataset,Extraction,Sample,Species,nonpareil,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(nonpareil ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 0.885552853 | 0.03450229 | 25.6664948 | 40.89542 | 7.753071e-27 |
| fixed | NA | ExtractionDREX | 0.005536892 | 0.04336611 | 0.1276779 | 48.00020 | 8.989373e-01 |
| fixed | NA | ExtractionEHEX | -0.112193866 | 0.04336611 | -2.5871326 | 48.00020 | 1.276294e-02 |
| ran_pars | Sample | sd__(Intercept) | 0.059565487 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.035030813 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.150224598 | NA | NA | NA | NA |
5.2 Alpha diversity
Variance partitioning metrics are derived from community_analysis.Rmd.
alpha_data <- list.files(path = "results", pattern = "^alpha_.*\\.tsv$", full.names = TRUE) %>%
map_df(~ read_tsv(.)) %>%
left_join(all_data,by= join_by(dataset==dataset))alpha_data %>%
pivot_longer(!c(dataset,Library,Species,Taxon,Sample,Extraction), names_to = "metric", values_to = "value") %>%
filter(metric %in% c("richness","neutral","phylogenetic")) %>%
mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal"))) %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
unique() %>%
ggplot(aes(x=Extraction,y=value, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(metric ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Diversity",x="Extraction method")5.2.1 Richness
alpha_data %>%
select(dataset,Extraction,Sample,Species,richness,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
lmerTest::lmer(richness ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 59.44444444 | 10.715517 | 5.5475108 | 10.12322 | 0.000234351 |
| fixed | NA | ExtractionDREX | 0.05555556 | 4.453352 | 0.0124750 | 34.97883 | 0.990117532 |
| fixed | NA | ExtractionEHEX | 1.38257472 | 4.545332 | 0.3041746 | 35.13134 | 0.762789322 |
| ran_pars | Sample | sd__(Intercept) | 16.00359332 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 28.56742258 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 13.36005511 | NA | NA | NA | NA |
5.2.2 Neutral
alpha_data %>%
select(dataset,Extraction,Sample,Species,neutral,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
lmerTest::lmer(neutral ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 29.5413181 | 5.139162 | 5.7482754 | 9.895197 | 0.000193227 |
| fixed | NA | ExtractionDREX | -2.1102758 | 1.922343 | -1.0977625 | 35.042971 | 0.279794105 |
| fixed | NA | ExtractionEHEX | -0.3422373 | 1.962694 | -0.1743712 | 35.152526 | 0.862574163 |
| ran_pars | Sample | sd__(Intercept) | 8.6429526 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 13.5543072 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 5.7670282 | NA | NA | NA | NA |
5.2.3 Phylogenetic
alpha_data %>%
select(dataset,Extraction,Sample,Species,phylogenetic,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
lmerTest::lmer(phylogenetic ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 4.7721984 | 0.4324985 | 11.034024 | 9.653749 | 8.685117e-07 |
| fixed | NA | ExtractionDREX | 0.1521100 | 0.1403528 | 1.083769 | 35.015284 | 2.858737e-01 |
| fixed | NA | ExtractionEHEX | 0.1852709 | 0.1433723 | 1.292236 | 35.056809 | 2.047290e-01 |
| ran_pars | Sample | sd__(Intercept) | 1.2179889 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.9236345 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.4210584 | NA | NA | NA | NA |
5.3 Microbial complexity recovery
all_data %>%
select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon) %>%
mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
mutate(damr=ifelse(is.na(damr),0,damr)) %>%
unique() %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(damr), sd(damr))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of breadth of host genome coverage")| Taxon | ZYMO | DREX | EHEX |
|---|---|---|---|
| Amphibian | 0.8±0.4 | 0.8±0.3 | 0.8±0.3 |
| Reptile | 0.9±0.1 | 1.0±0.0 | 1.0±0.0 |
| Mammal | 0.8±0.1 | 0.9±0.1 | 0.8±0.3 |
| Bird | 0.6±0.4 | 0.5±0.4 | 0.6±0.4 |
| Control | 1.0±0.0 | 1.0±0.0 | 1.0±0.0 |
all_data %>%
select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon,Sample,Species) %>%
mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
mutate(damr=ifelse(is.na(damr),0,damr)) %>%
unique() %>%
ggplot(aes(x=Extraction,y=damr, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(. ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Domain-adjusted mapping rate",x="Extraction method")all_data %>%
select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon, Sample, Species) %>%
mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
mutate(damr=ifelse(is.na(damr),0,damr)) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(damr ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 0.78271978 | 0.06660971 | 11.7508366 | 15.45687 | 4.117675e-09 |
| fixed | NA | ExtractionDREX | 0.03348421 | 0.03977983 | 0.8417385 | 130.19072 | 4.014779e-01 |
| fixed | NA | ExtractionEHEX | 0.02228400 | 0.03999367 | 0.5571881 | 130.24817 | 5.783552e-01 |
| ran_pars | Sample | sd__(Intercept) | 0.12024342 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.19047224 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.20283813 | NA | NA | NA | NA |
5.4 Variance partitioning
Variance partitioning metrics are derived from community_analysis.Rmd.
variance_data <- list.files(path = "results", pattern = "^var_.*\\.tsv$", full.names = TRUE) %>%
map_df(~ read_tsv(.))
variance_data %>% summarise(mean=mean(r2),sd=sd(r2))# A tibble: 1 × 2
mean sd
<dbl> <dbl>
1 0.102 0.0768
variance_data %>%
left_join(all_data %>% select(Species,Taxon) %>% unique(),by=join_by(species==Species)) %>%
mutate(metric=factor(metric,levels=c("phylogenetic","neutral","richness"))) %>%
mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal"))) %>%
ggplot(aes(x=r2,y=metric)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter(aes(color=species))+
scale_color_manual(values=vertebrate_colors) +
xlim(0,0.5)+
theme_minimal() +
labs(y="Diversity metric",x="Explained variance")| metric | mean | sd |
|---|---|---|
| neutral | 0.06201935 | 0.04744380 |
| phylogenetic | 0.07381277 | 0.06884687 |
| richness | 0.16970818 | 0.06626388 |
5.5 Combined community analysis
species="combined"
genus=species
sample_metadata <- read_tsv(paste0("data/metadata.tsv")) %>%
rename(dataset=Dataset)
read_counts <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_counts.tsv.gz") %>%
rename(genome = 1)
genome_metadata <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_mag_info.tsv.gz") %>%
rename(genome = 1, length=mag_size)
genome_coverage <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_coverage.tsv.gz") %>%
rename(genome = 1)
download.file("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134.tree.gz", "data/DMB0134.tree.gz")
genome_tree <- read_tree("data/DMB0134.tree.gz")5.5.1 Filter data
#Filter by coverage
min_coverage=0.3
read_counts_filt <- genome_coverage %>%
mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
mutate(across(-1, ~ . * read_counts[[cur_column()]]))
# Transform to genome counts (non-filtered)
readlength=150
genome_counts <- read_counts %>%
mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))
# Transform to genome counts (coverage-filtered)
readlength=150
genome_counts_filt <- read_counts_filt %>%
mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))5.5.2 Community barplot
# Retrieve EHI taxonomy colors
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
select(colors) %>%
pull()
# Stacked barplot
genome_counts %>%
mutate_at(vars(-genome), funs(./sum(., na.rm = TRUE))) %>% #apply TSS nornalisation
pivot_longer(-genome, names_to = "dataset", values_to = "count") %>%
left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append taxonomy
left_join(., sample_metadata, by = join_by(dataset == dataset)) %>%
mutate(Taxon=factor(Taxon,levels=c("Amphibian",
"Reptile",
"Mammal",
"Bird",
"Control"))) %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO",
"DREX",
"EHEX"))) %>%
mutate(Species=factor(Species,levels=c("Calotriton asper",
"Lissotriton helveticus",
"Salamandra atra",
"Chalcides striatus",
"Natrix astreptophora",
"Podarcis muralis",
"Plecotus auritus",
"Sciurus carolinensis",
"Trichosurus vulpecula",
"Geospizopsis unicolor",
"Perisoreus infaustus",
"Zonotrichia capensis",
"Extraction control",
"Library control"))) %>%
filter(Taxon != "Control") %>%
ggplot(., aes(x=dataset,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
geom_bar(stat="identity", colour="white", linewidth=0.1)+ #plot stacked bars with white borders
scale_fill_manual(values=phylum_colors) +
labs(y = "Relative abundance") +
facet_nested(. ~ Taxon + Species + Sample + Extraction, scales="free_x") +
guides(fill = guide_legend(ncol = 3)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.title.x = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
legend.title=element_blank(),
panel.spacing = unit(0, "lines"))genome_counts_NMDS <- genome_counts_filt %>%
column_to_rownames(var="genome") %>%
select(where(~!all(. == 0))) %>%
hillpair(.,q=1, metric="C", out="dist") %>%
metaMDS(.,trymax = 999, k=2, trace=0) %>%
vegan::scores() %>%
as_tibble(., rownames = "dataset") %>%
left_join(sample_metadata, by = join_by(dataset == dataset)) %>%
filter(Taxon != "Control") %>%
group_by(Sample) %>%
mutate(sample_x=mean(NMDS1), sample_y=mean(NMDS2))
genome_counts_NMDS %>%
ggplot(., aes(x=NMDS1,y=NMDS2, color=Species, shape=Extraction)) +
scale_color_manual(values=vertebrate_colors) +
geom_point(size=3, alpha=0.8) +
geom_segment(aes(x=sample_x, y=sample_y, xend=NMDS1, yend=NMDS2), alpha=0.2) +
theme_classic() +
theme(legend.position="right", legend.box="vertical") +
guides(color=guide_legend(title="Species"))